Install any packages you might be missing:
library(dplyr)
library(ggplot2)
library(leaflet)
library(sf) # handle simple features objects
library(units) # set_units() for converting units objects
library(spdep) # areal analysis
Set working directory where shapefiles are found
data.dir ="/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data"
Areal data are often stored as shapefiles (.shp). Shapefiles usually come in a set of at least 4 files:
.shp: main shapefile.shx: spatial index.dbf: dBase file which stores the non-spatial columns.prj: definition of the projection of the spatial dataFor example, the shapefile ca_counties.shp and its related files consist of polygons for California counties. We read shapefiles with the st_read() function from the sf package:
ca_counties = st_read(paste0(data.dir,"/shapes","/ca_counties.shp"))
## Reading layer `ca_counties' from data source `/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data/shapes/ca_counties.shp' using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 2 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.482 ymin: 32.52884 xmax: -114.1312 ymax: 42.00951
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
ca_counties
## Simple feature collection with 58 features and 2 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.482 ymin: 32.52884 xmax: -114.1312 ymax: 42.00951
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## county population geometry
## 1 Alameda 1666753 MULTIPOLYGON (((-122.2809 3...
## 2 Alpine 1101 MULTIPOLYGON (((-120.0733 3...
## 3 Amador 39383 MULTIPOLYGON (((-121.0273 3...
## 4 Butte 231256 MULTIPOLYGON (((-121.8565 3...
## 5 Calaveras 45602 MULTIPOLYGON (((-120.6309 3...
## 6 Colusa 21627 MULTIPOLYGON (((-122.0802 3...
## 7 Contra Costa 1150215 MULTIPOLYGON (((-122.2677 3...
## 8 Del Norte 27828 MULTIPOLYGON (((-124.3161 4...
## 9 El Dorado 190678 MULTIPOLYGON (((-121.1186 3...
## 10 Fresno 994400 MULTIPOLYGON (((-119.7054 3...
ca_counties %>% class()
## [1] "sf" "data.frame"
We can use leaflet to map areal data:
ca_counties %>%
leaflet() %>%
addProviderTiles('Wikimedia') %>%
addPolygons(weight=1, fillOpacity=.25, label=~county)
Shapefiles also come with data (in the .dbf) They aggregate/summarize values (e.g., population, income) over geographic divisions (e.g., countries, states, counties, census blocks, etc.).
pop.pal = colorNumeric(c('beige','brown','brown'),
domain=ca_counties$population)
ca_counties %>%
leaflet() %>%
addProviderTiles('CartoDB.Positron') %>%
addPolygons(fillColor=~pop.pal(population), weight=.5, fillOpacity=.9,
label=~paste0(county, ': ', population))
nc<-st_read(system.file("shape/nc.shp", package = "sf"), stringsAsFactors = FALSE)
## Reading layer `nc' from data source `/Library/Frameworks/R.framework/Versions/3.5/Resources/library/sf/shape/nc.shp' using driver `ESRI Shapefile'
## Simple feature collection with 100 features and 14 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
## epsg (SRID): 4267
## proj4string: +proj=longlat +datum=NAD27 +no_defs
nc<-st_transform(nc,4326) # need to transform so that shapefile has WGS
nc %>% class()
## [1] "sf" "data.frame"
nc
## Simple feature collection with 100 features and 14 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## AREA PERIMETER CNTY_ CNTY_ID NAME FIPS FIPSNO CRESS_ID BIR74
## 1 0.114 1.442 1825 1825 Ashe 37009 37009 5 1091
## 2 0.061 1.231 1827 1827 Alleghany 37005 37005 3 487
## 3 0.143 1.630 1828 1828 Surry 37171 37171 86 3188
## 4 0.070 2.968 1831 1831 Currituck 37053 37053 27 508
## 5 0.153 2.206 1832 1832 Northampton 37131 37131 66 1421
## 6 0.097 1.670 1833 1833 Hertford 37091 37091 46 1452
## 7 0.062 1.547 1834 1834 Camden 37029 37029 15 286
## 8 0.091 1.284 1835 1835 Gates 37073 37073 37 420
## 9 0.118 1.421 1836 1836 Warren 37185 37185 93 968
## 10 0.124 1.428 1837 1837 Stokes 37169 37169 85 1612
## SID74 NWBIR74 BIR79 SID79 NWBIR79 geometry
## 1 1 10 1364 0 19 MULTIPOLYGON (((-81.47276 3...
## 2 0 10 542 3 12 MULTIPOLYGON (((-81.23989 3...
## 3 5 208 3616 6 260 MULTIPOLYGON (((-80.45634 3...
## 4 1 123 830 2 145 MULTIPOLYGON (((-76.00897 3...
## 5 9 1066 1606 3 1197 MULTIPOLYGON (((-77.21767 3...
## 6 7 954 1838 5 1237 MULTIPOLYGON (((-76.74506 3...
## 7 0 115 350 2 139 MULTIPOLYGON (((-76.00897 3...
## 8 0 254 594 2 371 MULTIPOLYGON (((-76.56251 3...
## 9 4 748 1190 2 844 MULTIPOLYGON (((-78.30876 3...
## 10 1 160 2038 5 176 MULTIPOLYGON (((-80.02567 3...
plot(nc)
plot(nc[13])
sids.pal = colorNumeric(c('blue','purple','pink','yellow'),
domain=nc$SID79)
nc %>%
leaflet() %>%
addProviderTiles('CartoDB.Positron') %>%
addPolygons(fillColor=~sids.pal(SID79), weight=.5, fillOpacity=.9,
label=~paste0(NAME, ': ', BIR79)) %>%
addLegend("bottomleft", pal = sids.pal, values = ~SID79, title = "SIDS79 count", opacity = 1)
# try with sids rate
The main R package for areal analysis is spdep(). We need to create a sp object from the sf object for making connected graphs
We can then look at the connectivity of counties: shared border, nearest neighbours, and distance based.
For shared border connections, use poly2nb() and pick between queen and rook.
For k-nearest neighbours use knearneigh() to create matrix with index for the regions belonging to knn, and knn2nb() to create neighbourhood list.
For distance based connections, use dnearneigh().
# convert sf to spatial to use spdep
nc_sp <- sf:::as_Spatial(nc$geom)
# Shared border neighbours use poly2nb()
# Queen (default) and rook
sids_nb_queen<-poly2nb(nc_sp,queen=TRUE)
plot(nc_sp, main="Queen")
plot(sids_nb_queen,coordinates(nc_sp), add=TRUE,col="green")
#sharing full borders
sids_nb_rook<-poly2nb(nc_sp, queen=FALSE)
plot(nc_sp, main="Rook")
plot(sids_nb_rook,coordinates(nc_sp), add=TRUE,col="blue")
# see number of neighbours
card(sids_nb_rook)
## [1] 3 3 5 2 4 3 3 5 4 3 4 4 5 4 3 6 3 8 4 3 2 5 5 5 6 5 6 5 5 5 5 3 5 6 4
## [36] 6 6 3 9 5 4 6 6 5 2 6 6 8 6 5 7 5 6 6 5 2 6 4 4 3 6 6 7 4 5 4 8 5 5 6
## [71] 5 4 3 6 3 3 2 5 7 2 3 6 5 4 4 4 4 6 4 2 6 3 4 5 3 5 7 4 2 3
summary(card(sids_nb_queen))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 4.0 5.0 4.9 6.0 9.0
diffs<-diffnb(sids_nb_queen, sids_nb_rook)
plot(nc_sp, main="Difference queen, rook")
plot(diffs,coordinates(nc_sp),add=TRUE, col="red")
# k nearest neighbours
# knearneigh() creates matrix with index for the regions belonging to knn
# knn2nb() creates neighbourhood list
sids_kn1<-knn2nb(knearneigh(coordinates(nc_sp), k=1, RANN=FALSE))
sids_kn2<-knn2nb(knearneigh(coordinates(nc_sp), k=2, RANN=FALSE))
sids_kn4<-knn2nb(knearneigh(coordinates(nc_sp), k=4, RANN=FALSE))
plot(nc_sp, main="knn-1")
plot(sids_kn1, coordinates(nc_sp),add=TRUE,col="blue")
plot(nc_sp, main="knn-2")
plot(sids_kn2, coordinates(nc_sp),add=TRUE,col="green")
plot(nc_sp, main="knn-4")
plot(sids_kn4, coordinates(nc_sp), add=TRUE,col="black")
# difference between knn1 and knn2
diffs_knn<-diffnb(sids_kn1, sids_kn2)
plot(nc_sp, main="Diff knn-1, knn-2")
plot(diffs_knn, coordinates(nc_sp),add=TRUE,col="red")
# distance neighbours (still in degrees)
ndist<-unlist(nbdists(sids_kn1, coordinates(nc_sp)))
summary(ndist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1197 0.2582 0.2964 0.2910 0.3239 0.4226
# distance based neighbours
# median distance of knn=1 is about 30km
# creating neighbours by epsilon=30km
# d1=smallest distance, usually 0, d2=distance you want to go from the point
median_dist_kn1<-median(ndist)
sids_dist1<-dnearneigh(coordinates(nc_sp), d1=0, d2=median_dist_kn1)
sids_dist1b<-dnearneigh(coordinates(nc_sp), d1=0, d2=0.43)
plot(nc_sp, main="Distance neighbours")
plot(sids_dist1, coordinates(nc_sp), add=T,col="blue")
plot(sids_dist1b, coordinates(nc_sp), add=T,col="blue")
# 0.25 to maximum distance
max_dist_kn1<-max(ndist)
sids_dist2<-dnearneigh(coordinates(nc_sp), d1=0.25*max_dist_kn1, d2=1*max_dist_kn1)
plot(nc_sp, main="Distance neighbours")
plot(sids_dist2, coordinates(nc_sp), add=T,col="green")
sids_dist3<-dnearneigh(coordinates(nc_sp), d1=max_dist_kn1, d2=1.5*max_dist_kn1)
plot(nc_sp, main="Distance neighbours")
plot(sids_dist3, coordinates(nc_sp), add=T,col="purple")
We use the neibourhood connections to define weight matrices that link information between blocks.
We decide which neighbouhood connections we want (border, knn, or distance) and convert to weights. We use nb2listw() to convert the neighbourhood to weights.
Weights matrix can be W=row standardized, B=binary, C=globally standardized.
sids_kn2_b<-nb2listw(sids_kn2, style="B")
sids_kn2_b
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 200
## Percentage nonzero weights: 2
## Average number of links: 2
## Non-symmetric neighbours list
##
## Weights style: B
## Weights constants summary:
## n nn S0 S1 S2
## B 100 10000 200 352 1686
summary(unlist(sids_kn2_b$weights))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
sids_kn2_w<-nb2listw(sids_kn2, style="W")
sids_kn2_w
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 200
## Percentage nonzero weights: 2
## Average number of links: 2
## Non-symmetric neighbours list
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 100 10000 100 88 421.5
summary(unlist(sids_kn2_w$weights))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5 0.5 0.5 0.5 0.5 0.5
Neighbourhoods where there are some blocks that have no connections can be problematic. This is common when you use distance neighbourhoods (large counties may not connect to a neighbour within a smaller distance).
Make sure to use zero.policy=TRUE in these cases
sids_dist1_w<-nb2listw(sids_dist1, style="W", zero.policy = TRUE)
sids_queen_w<-nb2listw(sids_nb_queen, style="W")
sids_queen_w
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 490
## Percentage nonzero weights: 4.9
## Average number of links: 4.9
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 100 10000 100 44.65023 410.4746
summary(unlist(sids_queen_w$weights))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1111 0.1429 0.1667 0.2041 0.2500 0.5000
sids_dist1_mm<-nb2listw(sids_dist2, style="minmax")
sids_dist1_mm
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 306
## Percentage nonzero weights: 3.06
## Average number of links: 3.06
##
## Weights style: minmax
## Weights constants summary:
## n nn S0 S1 S2
## minmax 100 10000 51 17 118.4444
summary(unlist(sids_dist1_mm$weights))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1667 0.1667 0.1667 0.1667 0.1667 0.1667
sids_dist1_s<-nb2listw(sids_dist2, style="S")
sids_dist1_s
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 306
## Percentage nonzero weights: 3.06
## Average number of links: 3.06
##
## Weights style: S
## Weights constants summary:
## n nn S0 S1 S2
## S 100 10000 100 67.06724 428.6124
summary(unlist(sids_dist1_s$weights))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2382 0.2917 0.2917 0.3268 0.3368 0.5834
Inverse distance weighting, take distances from knn and apply \(1/x^2\) weights.
Need to define the weights first and then include in nb2listw
dists<-nbdists(sids_kn2, coordinates(nc_sp))
# inverse distance weighted weights
idw <- lapply(dists, function(x) 1/(x/2))
sids_idw_dist_w <- nb2listw(sids_kn2, glist = idw, style = "W")
Moran’s I is a global test of association, whereby we test “is there spatial similarity in a variable given a particular adjacency matrix”?
We need to test a variable, in this case we will use SIDS rate (divide SIDS counts by birth rate).
We need an adjacency matrix, let’s choose the knn-2 that we turned into weights.
Note for these methods to get the data associated with the counties, we need the sf object (in this case, nc).
moranSIDS<-moran.test(nc$SID79/nc$BIR79,sids_kn2_w)
moranSIDS
##
## Moran I test under randomisation
##
## data: nc$SID79/nc$BIR79
## weights: sids_kn2_w
##
## Moran I statistic standard deviate = 2.4465, p-value = 0.007213
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.214682382 -0.010101010 0.008442014
gearySIDS<-geary.test(nc$SID79,sids_kn2_w)
gearySIDS
##
## Geary C test under randomisation
##
## data: nc$SID79
## weights: sids_kn2_w
##
## Geary C statistic standard deviate = 2.7567, p-value = 0.002919
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic Expectation Variance
## 0.68915805 1.00000000 0.01271429